In [1]:
import glob
import os
import random
import subprocess
import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import pybedtools as pbt
import seaborn as sns
import statsmodels.stats.multitest as smm
import vcf as pyvcf
import cardipspy as cpy
import ciepy
%matplotlib inline
%load_ext rpy2.ipython
dy_name = 'rare_variant_eqtls'
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
cpy.makedir(dy)
pbt.set_tempdir(dy)
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
In [2]:
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls01', 'lead_variants.tsv')
lead_vars = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls02', 'lead_variants.tsv')
lead_vars2 = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls03', 'lead_variants.tsv')
lead_vars3 = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'unrelated_eqtls01', 'lead_variants.tsv')
unr_lead_vars = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
rna_meta_eqtl = rna_meta[rna_meta.in_eqtl]
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
subject_meta_eqtl = subject_meta.ix[rna_meta_eqtl.subject_id]
rna_meta_eqtl = rna_meta_eqtl.merge(subject_meta_eqtl, left_on='subject_id', right_index=True)
In [3]:
lead_vars = lead_vars[lead_vars.perm_sig]
lead_vars = lead_vars[lead_vars.variant_type == 'snv']
lead_vars2 = lead_vars2[lead_vars2.perm_sig]
lead_vars2 = lead_vars2[lead_vars2.variant_type == 'snv']
lead_vars3 = lead_vars3[lead_vars3.perm_sig]
lead_vars3 = lead_vars3[lead_vars3.variant_type == 'snv']
unr_lead_vars = unr_lead_vars[unr_lead_vars.perm_sig]
unr_lead_vars = unr_lead_vars[unr_lead_vars.variant_type == 'snv']
In [4]:
af_cols = [x for x in lead_vars.columns if '_AF' in x]
lead_vars_af = lead_vars.dropna(subset=af_cols)
lead_vars2_af = lead_vars2.dropna(subset=af_cols)
lead_vars3_af = lead_vars3.dropna(subset=af_cols)
In [5]:
lead_vars[af_cols] = lead_vars[af_cols].fillna(0)
In [6]:
lead_vars_af.AF.hist()
plt.ylabel('Number of variants')
plt.xlabel('Alternate allele frequency');
The allele frequencies from 1,000 Genomes are for the alternate allele, so I need to take care when I define which allele is minor.
In [7]:
a = sum((lead_vars_af[af_cols] < 0.005).sum(axis=1) == len(af_cols))
b = sum((lead_vars_af[af_cols] > 1 - 0.005).sum(axis=1) == len(af_cols))
print('There are {} variants where the alternate is rare and {} '
'where the reference is rare.'.format(a, b))
It doesn't seem like any of my eQTL lead variants that are in 1000 Genomes have rare reference alleles, so I can say that rare variants have the alternate allele as the minor allele.
In [8]:
lead_vars_af['rare'] = False
lead_vars_af.ix[lead_vars_af[(lead_vars_af[af_cols] < 0.005).sum(axis=1) == len(af_cols)].index, 'rare'] = True
a = lead_vars_af.rare.sum()
b = len(set(lead_vars_af.ix[lead_vars_af.rare, 'gene_id']))
print('There are {} rare lead variants for {} eGenes.'.format(a, b))
In [9]:
se = lead_vars_af[lead_vars_af.rare].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='rare', alpha=0.5, weights=weights, histtype='stepfilled')
se = lead_vars_af[lead_vars_af.rare == False].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='not rare', alpha=0.5, weights=weights, histtype='stepfilled')
plt.xlim(-3, 3)
plt.xlabel('Beta')
plt.ylabel('Fraction of lead variants')
plt.title('All samples')
plt.legend();
In [10]:
af_cols = [x for x in unr_lead_vars.columns if '_AF' in x]
unr_lead_vars_af = unr_lead_vars.dropna(subset=af_cols)
unr_lead_vars_af['rare'] = False
unr_lead_vars_af.ix[unr_lead_vars_af[(unr_lead_vars_af[af_cols] < 0.005).sum(axis=1) ==
len(af_cols)].index, 'rare'] = True
In [11]:
se = unr_lead_vars_af[unr_lead_vars_af.rare].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='rare', alpha=0.5, weights=weights, histtype='stepfilled')
se = unr_lead_vars_af[unr_lead_vars_af.rare == False].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='not rare', alpha=0.5, weights=weights, histtype='stepfilled')
plt.xlim(-3, 3)
plt.xlabel('Beta')
plt.ylabel('Fraction of lead variants')
plt.title('Unrelateds eQTL')
plt.legend();
In [12]:
def get_genotypes(df):
genotypes = pd.DataFrame(np.nan, index=df.index, columns=rna_meta_eqtl.wgs_id)
fn = '/projects/CARDIPS/analysis/cardips-ipsc-eqtl/private_output/eqtl_input/filtered_all/0000.vcf.gz'
vcf_reader = pyvcf.VCFReader(open(fn))
count = 0
for i in df.index:
if count % 1000 == 0:
print(count)
res = vcf_reader.fetch(df.ix[i, 'chrom'][3:], df.ix[i, 'end'],
df.ix[i, 'end'])
r = res.next()
hom_refs = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hom_refs()])
hets = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hets()])
hom_alts = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hom_alts()])
genotypes.ix[i, hom_refs] = 0
genotypes.ix[i, hets] = 1
genotypes.ix[i, hom_alts] = 2
count += 1
return genotypes
In [13]:
out = os.path.join(private_outdir, 'genotypes.tsv')
if not os.path.exists(out):
genotypes = get_genotypes(lead_vars_af)
genotypes.to_csv(out, sep='\t')
else:
genotypes = pd.read_table(out, index_col=0)
out = os.path.join(private_outdir, 'genotypes2.tsv')
if not os.path.exists(out):
genotypes2 = get_genotypes(lead_vars2_af)
genotypes2.to_csv(out, sep='\t')
else:
genotypes2 = pd.read_table(out, index_col=0)
out = os.path.join(private_outdir, 'genotypes3.tsv')
if not os.path.exists(out):
genotypes3 = get_genotypes(lead_vars3_af)
genotypes3.to_csv(out, sep='\t')
else:
genotypes3 = pd.read_table(out, index_col=0)
In [14]:
genotypes_f = genotypes.ix[lead_vars_af.index, rna_meta_eqtl.wgs_id]
In [15]:
lead_vars_af['ref_is_major'] = lead_vars_af.ac < lead_vars_af.ns
In [16]:
t = lead_vars_af[lead_vars_af.rare]
a = t[t.AF == 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50)
a = t[t.AF != 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50, color='red')
plt.ylim(-0.0001, 0.002)
plt.ylabel('1000 Genomes MAF')
plt.xlabel('CARDiPS MAF');
In [17]:
t = lead_vars_af[lead_vars_af.rare]
a = t[t.AF == 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50)
a = t[t.AF != 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50, color='red')
plt.ylim(-0.0001, 0.002)
plt.ylabel('1000 Genomes MAF')
plt.xlabel('CARDiPS MAF');
plt.xlim(0, 0.05)
Out[17]:
In [18]:
lead_vars_uniq = lead_vars.drop_duplicates(subset='location')
lead_vars_uniq['minor_AF'] = lead_vars_uniq.AF
lead_vars_uniq.ix[lead_vars_uniq.minor_AF > 0.5, 'minor_AF'] = \
1 - lead_vars_uniq.ix[lead_vars_uniq.minor_AF > 0.5, 'minor_AF']
lead_vars_uniq['ref_is_major'] = lead_vars_uniq.ac < lead_vars_uniq.ns
In [24]:
a = (lead_vars_uniq.AF == 0).value_counts()[False]
b = len(set(lead_vars_uniq.gene_id))
print('Obtained allele frequency for {:,} lead SNVs from {:,} eGenes from 1000 Genomes.\n'.format(a, b))
a = len(set(lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].gene_id))
nr = len(set(lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].location))
print('{:,} lead SNVs from {:,} eGenes were rare (MAF < 0.5%) in 1000 Genomes.'.format(nr, a))
n = (lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].maf > 0.05).value_counts()[True]
print('{:,} of these {:,} lead SNVs ({:.1f}%) are not rare among the 215 donors.'.format(n, nr, 100 * n / float(nr)))
a = (lead_vars_uniq.AF == 0).value_counts()[True]
print('{:,} of these {:,} lead SNVs were not seen in 1000 Genomes.'.format(a, nr))
a = len(set(lead_vars_uniq[(lead_vars_uniq.minor_AF < 0.005) &
(lead_vars_uniq.minor_AF > 0)].gene_id))
b = len(set(lead_vars_uniq[(lead_vars_uniq.minor_AF < 0.005) &
(lead_vars_uniq.minor_AF > 0)].location))
print('{:,} of these {:,} lead SNVs from {:,} eGenes were observed in 1000 Genomes but were '
'rare (MAF < 0.5%).\n'.format(b, nr, a))
In [25]:
jg = sns.jointplot(lead_vars_uniq.maf, lead_vars_uniq.minor_AF, alpha=0.2, stat_func=None);
In [48]:
lead_vars_uniq.to_csv(os.path.join(outdir, 'lead_vars_uniq.tsv'), sep='\t')
In [29]:
wgs_meta = subject_meta.copy(deep=True)
wgs_meta = wgs_meta.ix[rna_meta[rna_meta.in_eqtl].subject_id]
se = pd.Series(rna_meta[rna_meta.in_eqtl].wgs_id.values, index=rna_meta[rna_meta.in_eqtl].subject_id)
wgs_meta.index = se[list(wgs_meta.index)].values
In [30]:
family_vc = subject_meta_eqtl.family_id.value_counts()
family_vc = family_vc[family_vc * 2 >= lead_vars_uniq.ac.min()]
In [31]:
for i in family_vc.index:
se = genotypes.ix[lead_vars_uniq.index,
rna_meta_eqtl[rna_meta_eqtl.family_id == i].wgs_id].sum(axis=1)
c = '{}_maf'.format(i)
lead_vars_uniq[c] = np.nan
lead_vars_uniq.ix[lead_vars_uniq.ref_is_major, c] = se[lead_vars_uniq.ref_is_major.values] / (family_vc[i] * 2)
lead_vars_uniq.ix[lead_vars_uniq.ref_is_major == False, c] = \
(family_vc[i] * 2 - se[(lead_vars_uniq.ref_is_major == False).values]) / (family_vc[i] * 2)
c = '{}_mac'.format(i)
lead_vars_uniq.ix[lead_vars_uniq.ref_is_major, c] = se[lead_vars_uniq.ref_is_major.values]
lead_vars_uniq.ix[(lead_vars_uniq.ref_is_major == False).values, c] = \
(family_vc[i] * 2 - se[(lead_vars_uniq.ref_is_major == False).values])
In [38]:
t = lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005]
num_fams = []
for i in t.index:
se = genotypes.ix[i]
mode = se.mode().values[0]
num_fams.append(len(set(wgs_meta.ix[se[se != mode].index, 'family_id'])))
num_fams = pd.Series(num_fams)
a = sum(num_fams >= 2)
print('{:,} of {:,} rare lead variants were observed in more than one family.'.format(a, t.shape[0]))
In [47]:
ax = num_fams.value_counts().sort_index().plot.bar()
ax.set_ylabel('Number of lead SNVs')
ax.set_xlabel('Number of families observed in');
In [49]:
num_fams.to_csv(os.path.join(outdir, 'rare_num_fams.tsv'), sep='\t')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
family_vc = subject_meta_eqtl.family_id.value_counts()
family_vc = family_vc[family_vc * 2 >= lead_vars_af.ac.min()]
In [21]:
for i in family_vc.index:
se = genotypes_f[rna_meta_eqtl[rna_meta_eqtl.family_id == i].wgs_id].sum(axis=1)
c = '{}_maf'.format(i)
lead_vars_af[c] = np.nan
lead_vars_af.ix[lead_vars_af.ref_is_major, c] = se[lead_vars_af.ref_is_major] / (family_vc[i] * 2)
lead_vars_af.ix[lead_vars_af.ref_is_major == False, c] = \
(family_vc[i] * 2 - se[lead_vars_af.ref_is_major == False]) / (family_vc[i] * 2)
c = '{}_mac'.format(i)
lead_vars_af.ix[lead_vars_af.ref_is_major, c] = se[lead_vars_af.ref_is_major]
lead_vars_af.ix[lead_vars_af.ref_is_major == False, c] = \
(family_vc[i] * 2 - se[lead_vars_af.ref_is_major == False])
In [22]:
c = '{}_maf'.format(family_vc.index[1])
t = lead_vars_af[lead_vars_af[c] > lead_vars_af.maf]
t = t[t.rare]
t = t[t.maf < 0.05]
In [23]:
plt.scatter(t.maf, t[c])
Out[23]:
In [24]:
for i in family_vc.index:
c = '{}_mac'.format(i)
a = set(subject_meta.ix[subject_meta.family_id == i, 'ethnicity_group'])
print(i, len(set(lead_vars_af.ix[lead_vars_af.ac == lead_vars_af[c], 'gene_id'])), list(a))
In [21]:
subject_meta[subject_meta.family_id == '84fda65d-9a06-4bbe-ad75-a24773724c32'].head()
Out[21]:
In [21]:
lead_vars_af.to_csv(os.path.join(outdir, 'lead_vars_af.tsv'), sep='\t')
unr_lead_vars_af.to_csv(os.path.join(outdir, 'unr_lead_vars_af.tsv'), sep='\t')
In [28]:
tdf = lead_vars_af[lead_vars_af.rare]
In [31]:
plt.scatter(tdf.AF, tdf.beta)
Out[31]:
In [46]:
t = lead_vars_af[lead_vars_af.ac == lead_vars_af['84fda65d-9a06-4bbe-ad75-a24773724c32_mac']]
t.sort_values(by='pvalue', inplace=True)
t.drop_duplicates(subset=['gene_id']).T.head(26)
Out[46]:
In [47]:
t = lead_vars_af[lead_vars_af.ac == lead_vars_af['abb401f1-5c3e-48ed-8c55-839ce2afe7e6_mac']]
t.sort_values(by='pvalue', inplace=True)
t.drop_duplicates(subset=['gene_id']).T.head(26)
Out[47]:
TODO: Are any of the lead variants for the second or third eQTLs rare?
In [50]:
sum(lead_vars2.AF < 0.005)
Out[50]:
In [51]:
sum(lead_vars3.AF < 0.005)
Out[51]:
In [ ]: